---
title: "R-Code Tutorial: Conducting a Meta-Analysis"
subtitle: "A Step-by-Step Example to Analyzing the Effects of Sulfur Fertilization on Wheat Yield"
author: "Gustavo Roa"
date: "2024-05-21"

---


# Introduction

Meta-analysis is a powerful statistical technique that allows researchers to combine and analyze data from multiple studies to obtain a more comprehensive understanding of a particular research question. In this tutorial, we will demonstrate how to conduct a meta-analysis in R, using the example of investigating the effects of sulfur fertilization on wheat yield and protein content.

# Data Preparation

Before conducting the meta-analysis, it is essential to prepare the data in a suitable format. The data should be organized in a spreadsheet or data frame, with each row representing a single study or comparison, and columns containing relevant information such as study ID, treatment and control means, sample sizes, and measures of variability (e.g., standard deviation or standard error). For this example we will be using the file "Sulfur_datasets_GRoa.xlsx", this dataset includes the compiled data from the 55 studies used in the meta-analysis, which were sourced from published literature.

**Note: Please ensure that there is a folder named "output" in the same directory where your dataset and the R code tutorial file are located. This is necessary for the code to execute properly.**

# Load Necessary Packages

To begin, we need to load the necessary R packages for data wrangling, visualization, and meta-analysis. 

```{r, message  = FALSE, warning=FALSE}
# Required packages

library(tidyverse) # data wrangling and visualization
library(readxl) # read excel files
library(bayestestR) # bootstrap summaries
library(metafor) # meta-analytic model
library(maps) # world maps 
library(parallel) # parallel processing
library(multidplyr) # A Multi-Process 'dplyr'
library(purrr) # Functional Programming Tools

```

# Load Necessary Funtions

The following functions are copied directly from the file functions_sys_reviews.R from [Giordano, N. 2023. Systematic-reviews-in-R. Github](https://github.com/giordanon/Systematic-reviews-in-R)

## Bootstrap Meta-Analysis Model

This function performs a bootstrap meta-analysis, either with or without a moderator variable.

```{r}

# Function for back transforming effect sizes

trans = function(x){
  out = (exp(x)-1)*100
  return(out)
}

# Bootstrap meta-analytic model

bootstrap_rma = function(data, #input data
                         response_variable,  # name of the response variable in between quotations
                         moderator, # name of the moderator variable if any just type in NA
                         boot_num, # number of  bootstrap samples
                         cores = 16, # number of cores available in your computer
                         seed = 1) # seed for replicability
{
  set.seed(seed)
  cluster <- new_cluster(cores)
  cluster_library(cluster, c("metafor","dplyr", "purrr", "modelr", "multidplyr"))
  
  
  
  if (is.na(moderator)) {
    
    system.time(
      data %>%
        rename(.,
               RV = all_of(response_variable)) %>%  
        #Run bootstrapped models
        modelr::bootstrap(n =  boot_num, id = 'boot_num') %>%
        group_by(boot_num) %>%
        multidplyr::partition(cluster) %>% 
        mutate(fit = strap %>% map(~rma(yi = RV,
                                        vi = VAR,
                                        weights = W,
                                        data = .x)
        )
        ) %>% 
        mutate(mod_val = fit %>% map(~data.frame(ESTIM = coef(.x),
                                                 PVAL = summary(.x)$pval)
        )
        )  %>%
        dplyr::select(-fit, -strap) %>%
        dplyr::collect() %>% 
        unnest(mod_val) %>% 
        saveRDS(paste0("output/",response_variable,"_mod.RData"))
    )
    
  }
  
  if (!is.na(moderator)) {
    
    system.time(
      data %>%
        rename(.,
               MOD = all_of(moderator), 
               RV = all_of(response_variable)) %>%  
        #Run bootstrapped models
        modelr::bootstrap(n =  boot_num, id = 'boot_num') %>%
        group_by(boot_num) %>%
        multidplyr::partition(cluster) %>% 
        mutate(fit = strap %>% map(~rma(yi = RV,
                                        vi = VAR,
                                        mods = ~ 0 + MOD, 
                                        weights = W,
                                        data = .x)
        )
        ) %>% 
        mutate(mod_val = fit %>% map(~data.frame(MOD = levels(as.factor(pluck(.x, "data")$MOD)),
                                                 ESTIM = coef(.x),
                                                 PVAL = summary(.x)$pval)
        )
        )  %>%
        dplyr::select(-fit, -strap) %>%
        dplyr::collect() %>% 
        unnest(mod_val) %>% 
        saveRDS(paste0("output/",response_variable,"_",moderator,"_mod.RData"))
    )
  }
}
```

## Summarize Bootstraps

This function summarizes the bootstrap results, providing confidence intervals for the estimated effects.

```{r}

# Summarise bootstraps

`%nin%` <- Negate(`%in%`)

summarise_bootstraps = function(data){  # data output as obteined from bootstrap_rma() function
  
  if ("MOD" %in% colnames(data)) {
    
    boot = data %>% 
      group_by(MOD) %>% 
      summarise_at(vars(ESTIM, PVAL), list(q975 = ~quantile(., 0.975, na.rm=T),
                                           q025 = ~quantile(., 0.025, na.rm=T),
                                           #hdi = ~hdi(.), you might want to use HDI depending on the distribution of the resamples. HDI is worth using it for p-values bootstraps
                                           q500 = ~quantile(.,0.500, na.rm=T))) 
    
  }
  
  if ("MOD" %nin% colnames(data)) {
    
    boot = data %>% 
      ungroup() %>%  
      summarise_at(vars(ESTIM, PVAL), list(q975 = ~quantile(., 0.975, na.rm=T),
                                           q025 = ~quantile(., 0.025, na.rm=T),
                                           #hdi = ~hdi(.), you might want to use HDI depending on the distribution of the resamples. HDI is worth using it for p-values bootstraps
                                           q500 = ~quantile(.,0.500, na.rm=T))) 
    
  }
  
  
  return(boot)
  
}
```

# Load data

Next, we load the dataset containing the compiled data from 55 studies.

```{r data}
raw_data <- read_excel("Sulfur_datasets_GRoa.xlsx" ,sheet = "database")

print(raw_data)
```

# Maps

We can create a map to visualize the geographic distribution of the studies included in the meta-analysis.

```{r fig.width=8, fig.height=4}

map <- ggplot(data = raw_data) + 
  borders("world", colour="#a7a9ac", fill="#e2e3e4") +
  geom_point(aes(x=longitude, y=latitude), color="purple4", size=2) + 
  xlab("Longitude") + ylab("Latitude") + 
  theme_classic(base_size = 12) +
  theme(axis.text = element_text(color = "black"), panel.background = element_rect(colour = "black", linewidth=1))
 
map

```

# Yield

## Prepare and Transform Data: Calculate effect sizes & pooled sample variance

```{r}
# Yield
data_all_yield = 
  raw_data %>% mutate(
  
            # Response Ratio
            RR = log(treated_yield_kg_ha/control_yield_kg_ha),
            
            # Calculate pooled sampling variance
          
            VAR = ( (sd_treated_yield_kg_ha^2)/(rep_or_block*(treated_yield_kg_ha^2)) ) + 
                  ( (sd_control_yield_kg_ha^2)/(rep_or_block*(control_yield_kg_ha^2)) ),
            
            # Weights
            W = 1/VAR            
            )

```

## Aggregate Data by Study

```{r}
# Example data structure with study identifier
data_yield_study <- data_all_yield %>%
  group_by(year, authors, study) %>%
  summarize(
    yi = sum(RR * W) / sum(W),  # Weighted mean effect size
    vi = sum(VAR) / n(),        # Mean variance
    .groups = 'drop', 
    N = n())

```

## Run model - intercept only

```{r}
# Conduct meta-analysis
mod_yield <- rma(yi = yi, 
                 vi = vi, 
                 data = data_yield_study)

print(mod_yield)

# Transform to percentage change

intrcpt <- (exp(coef(mod_yield)) - 1) * 100

print(intrcpt)

```

## Model heterogeneity, publication bias and influential cases

```{r}
# Assess heterogeneity
print(mod_yield$I2)

# Publication bias analysis
funnel(mod_yield)
regtest(mod_yield)
ranktest(mod_yield)

# Diagnostics for potential outliers and influential cases
inf <- influence(mod_yield)
plot(inf)

```

## Forest plot

```{r, fig.width=8, fig.height=11}


forest(mod_yield, xlim=c(-2,1.5), showweights=TRUE, 
       xlab= "Effect on yield (Log)",
       cex=.8, slab=paste(authors, year, sep=", "), addcred = TRUE, header="Author(s) & year", col = "black")
op <- par(cex=.8, font=2)

par(op)
 
text(-2, -1, pos=4, cex=0.8, bquote(paste("RE Model (Q = ",
     .(formatC(mod_yield$QE, digits=1, format="f")), ", df = ", .(mod_yield$k - mod_yield$p),
     ", p = ", .(formatC("< .001", format="f")), "; ", I^2, " = ",
     .(formatC(mod_yield$I2, digits=1, format="f")), "%)")))

text(0.894, -2.3, pos=4, cex=0.8, bquote(paste("(", .(formatC(intrcpt, digits=1, format="f")), "%)")))



```

## Influential Studies Diagnosis

```{r}
 cooks.distance.rma.uni(model = mod_yield, progbar = T) %>% 
   saveRDS("output/cooksD_diagnosis.RData")
 
plot(readRDS("output/cooksD_diagnosis.RData"))

influential_cases = c(2, 3, 4)

data.imp_es_ic = 
  data_all_yield %>% 
  mutate(W = case_when(study %in% influential_cases ~ 0, T~W))

```

## Run unbootstrapped model - Texture

The weight of influential studies is set to zero. This step involves conducting a mixed-effects meta-regression model. Using the texture_group.

```{r}

mod_texture = rma(yi = RR,
          vi = VAR,
          weights = W,
          mods = ~ 0 + texture_group, # change
          data = data.imp_es_ic)


print(mod_texture)


```

## Pooled effects, intercept only model

Bootstrap analysis with at least 1000 bootstrap samples is recommended. Ensure the number of cores available on your computer is used for parallel processing. This may take several minutes to run. 

```{r bootstrap}

ncores <- detectCores()

bootstrap_rma(data = data.imp_es_ic, response_variable = "RR", moderator = NA, boot_num = 1000, cores = ncores)


```

## Test potential moderators

This section tests the effects of potential moderators, in this case, the "texture_group."

```{r moderators}

bootstrap_rma(data = data.imp_es_ic, response_variable = "RR",moderator = "texture_group", boot_num = 1000, cores = ncores) 

```

## Summarize Bootstrap Results

```{r summarise_boot}

df.plot = summarise_bootstraps(readRDS("output/RR_texture_group_mod.RData"))

df.plot$MOD <- factor(df.plot$MOD, levels=c("Loam","Silty", "Clay", "Sandy"))

df.plot

```

## Plot

Generate a plot to visualize the bootstrap results, showing the effect sizes for different texture groups.

```{r fig.width=6, fig.height=4}

FigX <- df.plot %>% 
  ggplot()+
  geom_linerange(aes(xmin = trans(ESTIM_q975), xmax = trans(ESTIM_q025), y = MOD), linewidth = 1)+
  geom_point(aes(y = MOD, x = trans(ESTIM_q500)), shape = 21, size = 6, stroke = 1.2, fill = "#A1A1A1", color = "black") +
  xlim(-4,6)+
  labs(y = "", x = "Effect (%)") + 
  theme_classic(base_size = 14) +
  theme(axis.text = element_text(color = "black")) +
  theme(panel.background = element_rect(colour = "black", linewidth=1.5)) +
guides(fill = "none") +
  ggtitle("Yield")

FigX

```
